In [47]:
import numpy as np
from numpy.linalg import svd, matrix_rank
import pandas as pd
import matplotlib.pyplot as plt
from IPython import get_ipython
from util import (
svdcomp,
nextplot,
plot_matrix,
plot_xy,
plot_cov,
match_categories,
) # see util.py
from sklearn.cluster import KMeans
# setup plotting
import psutil
inTerminal = not "IPKernelApp" in get_ipython().config
inJupyterNb = any(filter(lambda x: x.endswith("jupyter-notebook"), psutil.Process().parent().cmdline()))
inJupyterLab = any(filter(lambda x: x.endswith("jupyter-lab"), psutil.Process().parent().cmdline()))
if not inJupyterLab:
from IPython import get_ipython
get_ipython().run_line_magic("matplotlib", "" if inTerminal else "notebook" if inJupyterNb else "widget")
1 Intuition on SVD¶
picture is from Wikipedia¶
In [48]:
M1 = np.array(
[
[1, 1, 1, 0, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
]
)
#m1_u = np.array([[1/np.sqrt(3)], [1/np.sqrt(3)], [1/np.sqrt(3)], [0], [0]])
#m1_vt = np.array([1/np.sqrt(3), 1/np.sqrt(3), 1/np.sqrt(3), 0, 0]).reshape(1, 5)
#m1_singular = 3
#m1_construct = 3 * (m1_u @ m1_vt)
#rank=1
m1_u = 1/np.sqrt(3)*np.array([1, 1, 1, 0, 0])
m1_vt = 1/np.sqrt(3)*np.array([1, 1, 1, 0, 0])
m1_singular = 3
m1_construct = m1_singular * (np.outer(m1_u, m1_vt))
m1_construct
Out[48]:
array([[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.],
[0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0.]])
In [49]:
M2 = np.array(
[
[0, 0, 0, 0, 0],
[0, 2, 1, 2, 0],
[0, 2, 1, 2, 0],
[0, 2, 1, 2, 0],
[0, 0, 0, 0, 0],
]
)
#rank=1
m2_u = 1/np.sqrt(3) * np.array([0, 1, 1, 1, 0])
m2_vt = 1/3 * np.array([0, 2, 1, 2, 0])
m2_singular = np.sqrt(27)
m2_construct = m2_singular * (np.outer(m2_u, m2_vt))
m2_construct
Out[49]:
array([[0., 0., 0., 0., 0.],
[0., 2., 1., 2., 0.],
[0., 2., 1., 2., 0.],
[0., 2., 1., 2., 0.],
[0., 0., 0., 0., 0.]])
In [50]:
M3 = np.array([[0, 0, 0, 0],
[0, 1, 1, 1],
[0, 1, 1, 1],
[0, 1, 1, 1],
[0, 1, 1, 1]])
#rank=1
m3_u = 1/2 * np.array([0, 1, 1, 1, 1])
m3_vt = 1/np.sqrt(3) * np.array([0, 1, 1, 1])
m3_singular = 2*np.sqrt(3)
m3_construct = m3_singular * (np.outer(m3_u, m3_vt))
m3_construct
Out[50]:
array([[0., 0., 0., 0.],
[0., 1., 1., 1.],
[0., 1., 1., 1.],
[0., 1., 1., 1.],
[0., 1., 1., 1.]])
In [51]:
M4 = np.array(
[
[1, 1, 1, 0, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 0, 0],
[0, 0, 0, 1, 1],
[0, 0, 0, 1, 1],
]
)
# Rank 2
m4_u1 = 1/np.sqrt(3) * np.array([1, 1, 1, 0, 0])
m4_u2 = 1/np.sqrt(2) * np.array([0, 0, 0, 1, 1])
m4_vt1 = 1/np.sqrt(3) * np.array([1, 1, 1, 0, 0])
m4_vt2 = 1/np.sqrt(2) * np.array([0, 0, 0, 1, 1])
m4_singular1 = np.sqrt(3*3)
m4_singular2 = np.sqrt(2*2)
m4_construct = m4_singular1 * np.outer(m4_u1, m4_vt1) + m4_singular2 * np.outer(m4_u2, m4_vt2)
m4_construct
Out[51]:
array([[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.],
[1., 1., 1., 0., 0.],
[0., 0., 0., 1., 1.],
[0., 0., 0., 1., 1.]])
In [52]:
M5 = np.array([
[1, 1, 1, 0, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 1, 1],
[0, 0, 1, 1, 1],
[0, 0, 1, 1, 1]
])
#couldn't solve just by looking. rank=3, there will be three non-zero singular values.
In [53]:
M6 = np.array(
[
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 0, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
]
)
#couldn't solve just by looking. rank=2, there will be two non-zero singular values.
1b¶
In [54]:
# YOUR PART
U, s, Vt = svd(M1)
print(f"Rank(M1): {np.linalg.matrix_rank(M1)}")
print('************************************************')
print('Numpy U: ', U[:, 0])
print('************************************************')
print('Numpy Sigma: ', np.round(s[0]))
print('************************************************')
print('Numpy Vt: ', Vt[0,:])
print('************************************************')
print('MY RESULTS: ')
print('My U: ', m1_u)
print('************************************************')
print('My Sigma: ', m1_singular)
print('************************************************')
print('My Vt: ', m1_vt)
print('************************************************')
print('M1 matrix:\n', M1)
print('My reconstructed matrix:\n', m1_construct)
Rank(M1): 1 ************************************************ Numpy U: [-0.57735027 -0.57735027 -0.57735027 0. 0. ] ************************************************ Numpy Sigma: 3.0 ************************************************ Numpy Vt: [-0.57735027 -0.57735027 -0.57735027 -0. -0. ] ************************************************ MY RESULTS: My U: [0.57735027 0.57735027 0.57735027 0. 0. ] ************************************************ My Sigma: 3 ************************************************ My Vt: [0.57735027 0.57735027 0.57735027 0. 0. ] ************************************************ M1 matrix: [[1 1 1 0 0] [1 1 1 0 0] [1 1 1 0 0] [0 0 0 0 0] [0 0 0 0 0]] My reconstructed matrix: [[1. 1. 1. 0. 0.] [1. 1. 1. 0. 0.] [1. 1. 1. 0. 0.] [0. 0. 0. 0. 0.] [0. 0. 0. 0. 0.]]
In [55]:
U, s, Vt = svd(M2)
print(f"Rank(M2): {np.linalg.matrix_rank(M2)}")
print('************************************************')
print('Numpy U: ', U[:, 0])
print('************************************************')
print('Numpy Sigma: ', np.round(s[0], 5))
print('************************************************')
print('Numpy Vt: ', Vt[0,:])
print('************************************************')
print('MY RESULTS: ')
print('My U: ', m2_u)
print('************************************************')
print('My Sigma: ', m2_singular)
print('************************************************')
print('My Vt: ', m2_vt)
print('************************************************')
print('M2 matrix:\n', M2)
print('My reconstructed matrix:\n', m2_construct)
Rank(M2): 1 ************************************************ Numpy U: [ 0. -0.57735027 -0.57735027 -0.57735027 0. ] ************************************************ Numpy Sigma: 5.19615 ************************************************ Numpy Vt: [-0. -0.66666667 -0.33333333 -0.66666667 -0. ] ************************************************ MY RESULTS: My U: [0. 0.57735027 0.57735027 0.57735027 0. ] ************************************************ My Sigma: 5.196152422706632 ************************************************ My Vt: [0. 0.66666667 0.33333333 0.66666667 0. ] ************************************************ M2 matrix: [[0 0 0 0 0] [0 2 1 2 0] [0 2 1 2 0] [0 2 1 2 0] [0 0 0 0 0]] My reconstructed matrix: [[0. 0. 0. 0. 0.] [0. 2. 1. 2. 0.] [0. 2. 1. 2. 0.] [0. 2. 1. 2. 0.] [0. 0. 0. 0. 0.]]
In [56]:
U, s, Vt = svd(M3)
print(f"Rank(M3): {np.linalg.matrix_rank(M3)}")
print('************************************************')
print('Numpy U: ', U[:, 0])
print('************************************************')
print('Numpy Sigma: ', np.round(s[0], 5))
print('************************************************')
print('Numpy Vt: ', Vt[0,:])
print('************************************************')
print('MY RESULTS: ')
print('My U: ', m3_u)
print('************************************************')
print('My Sigma: ', m3_singular)
print('************************************************')
print('My Vt: ', m3_vt)
print('************************************************')
print('M3 matrix:\n', M3)
print('My reconstructed matrix:\n', m3_construct)
Rank(M3): 1 ************************************************ Numpy U: [ 0. -0.5 -0.5 -0.5 -0.5] ************************************************ Numpy Sigma: 3.4641 ************************************************ Numpy Vt: [-0. -0.57735027 -0.57735027 -0.57735027] ************************************************ MY RESULTS: My U: [0. 0.5 0.5 0.5 0.5] ************************************************ My Sigma: 3.4641016151377544 ************************************************ My Vt: [0. 0.57735027 0.57735027 0.57735027] ************************************************ M3 matrix: [[0 0 0 0] [0 1 1 1] [0 1 1 1] [0 1 1 1] [0 1 1 1]] My reconstructed matrix: [[0. 0. 0. 0.] [0. 1. 1. 1.] [0. 1. 1. 1.] [0. 1. 1. 1.] [0. 1. 1. 1.]]
In [57]:
U, s, Vt = svd(M4)
print(f"Rank(M4): {np.linalg.matrix_rank(M4)}")
print('************************************************')
print('Numpy U1: ', U[:, 0])
print('Numpy U2: ', U[:, 1])
print('************************************************')
print('Numpy Sigma1: ', np.round(s[0], 5))
print('Numpy Sigma2: ', np.round(s[1], 5))
print('************************************************')
print('Numpy Vt1: ', Vt[0, :])
print('Numpy Vt2: ', Vt[1, :])
print('************************************************')
print('MY RESULTS: ')
print('My U1: ', m4_u1)
print('My U2: ', m4_u2)
print('************************************************')
print('My Sigma1: ', m4_singular1)
print('My Sigma2: ', m4_singular2)
print('************************************************')
print('My Vt1: ', m4_vt1)
print('My Vt2: ', m4_vt2)
print('************************************************')
print('M4 matrix:\n', M4)
print('My reconstructed matrix:\n', m4_construct)
Rank(M4): 2 ************************************************ Numpy U1: [-0.57735027 -0.57735027 -0.57735027 0. 0. ] Numpy U2: [ 0. 0. 0. -0.70710678 -0.70710678] ************************************************ Numpy Sigma1: 3.0 Numpy Sigma2: 2.0 ************************************************ Numpy Vt1: [-0.57735027 -0.57735027 -0.57735027 -0. -0. ] Numpy Vt2: [-0. -0. -0. -0.70710678 -0.70710678] ************************************************ MY RESULTS: My U1: [0.57735027 0.57735027 0.57735027 0. 0. ] My U2: [0. 0. 0. 0.70710678 0.70710678] ************************************************ My Sigma1: 3.0 My Sigma2: 2.0 ************************************************ My Vt1: [0.57735027 0.57735027 0.57735027 0. 0. ] My Vt2: [0. 0. 0. 0.70710678 0.70710678] ************************************************ M4 matrix: [[1 1 1 0 0] [1 1 1 0 0] [1 1 1 0 0] [0 0 0 1 1] [0 0 0 1 1]] My reconstructed matrix: [[1. 1. 1. 0. 0.] [1. 1. 1. 0. 0.] [1. 1. 1. 0. 0.] [0. 0. 0. 1. 1.] [0. 0. 0. 1. 1.]]
In [58]:
# YOUR PART
U, s, Vt = svd(M5)
print(f"Rank(M5): {np.linalg.matrix_rank(M5)}")
print('************************************************')
print('Numpy U1: ', U[:, 0])
print('Numpy U2: ', np.round(U[:, 1], 4))
print('Numpy U3: ', U[:, 2])
print('************************************************')
print('Numpy Sigma(1,2,3): ', np.round(s[:3], 3))
print('************************************************')
print('Numpy Vt1: ', Vt[0, :])
print('Numpy Vt2: ', np.round(Vt[1, :], 4))
print('Numpy Vt3: ', Vt[2, :])
print('************************************************')
print("SVD reconstructed: ")
print(np.round(svdcomp(M5, range(3)), 5))
print('Original matrix: ')
print(M5)
Rank(M5): 3 ************************************************ Numpy U1: [-0.39410272 -0.39410272 -0.61541221 -0.39410272 -0.39410272] Numpy U2: [-0.5 -0.5 -0. 0.5 0.5] Numpy U3: [ 0.3077061 0.3077061 -0.78820544 0.3077061 0.3077061 ] ************************************************ Numpy Sigma(1,2,3): [3.562 2. 0.562] ************************************************ Numpy Vt1: [-0.39410272 -0.39410272 -0.61541221 -0.39410272 -0.39410272] Numpy Vt2: [-0.5 -0.5 -0. 0.5 0.5] Numpy Vt3: [-0.3077061 -0.3077061 0.78820544 -0.3077061 -0.3077061 ] ************************************************ SVD reconstructed: [[ 1. 1. 1. -0. -0.] [ 1. 1. 1. -0. -0.] [ 1. 1. 1. 1. 1.] [-0. -0. 1. 1. 1.] [-0. -0. 1. 1. 1.]] Original matrix: [[1 1 1 0 0] [1 1 1 0 0] [1 1 1 1 1] [0 0 1 1 1] [0 0 1 1 1]]
In [59]:
U, s, Vt = svd(M6)
print(f"Rank(M6): {np.linalg.matrix_rank(M6)}")
print('************************************************')
print('Numpy U1: ', U[:, 0])
print('Numpy U2: ', np.round(U[:, 1], 4))
print('************************************************')
print('Numpy Sigma(1,2): ', np.round(s[:2], 3))
print('************************************************')
print('Numpy Vt1: ', Vt[0, :])
print('Numpy Vt2: ', np.round(Vt[1, :], 4))
print('************************************************')
print("SVD reconstructed: ")
print(np.round(svdcomp(M6, range(2)), 5))
print('Original matrix: ')
print(M6)
Rank(M6): 2 ************************************************ Numpy U1: [-0.46193977 -0.46193977 -0.38268343 -0.46193977 -0.46193977] Numpy U2: [-0.1913 -0.1913 0.9239 -0.1913 -0.1913] ************************************************ Numpy Sigma(1,2): [4.828 0.828] ************************************************ Numpy Vt1: [-0.46193977 -0.46193977 -0.38268343 -0.46193977 -0.46193977] Numpy Vt2: [ 0.1913 0.1913 -0.9239 0.1913 0.1913] ************************************************ SVD reconstructed: [[ 1. 1. 1. 1. 1.] [ 1. 1. 1. 1. 1.] [ 1. 1. -0. 1. 1.] [ 1. 1. 1. 1. 1.] [ 1. 1. 1. 1. 1.]] Original matrix: [[1 1 1 1 1] [1 1 1 1 1] [1 1 0 1 1] [1 1 1 1 1] [1 1 1 1 1]]
1c¶
In [60]:
# You can use the functions svdcomp and plot_matrix from util.py
# YOUR PART
matrices = {'M1': M1, 'M2': M2, 'M3': M3, 'M4': M4, 'M5': M5, 'M6': M6}
for matrix_n, matrix in matrices.items():
print(f'{matrix_n} Reconstructed: ')
plot_matrix(svdcomp(matrix, range(1)))
print(f'{matrix_n} Original: ')
plot_matrix(matrix)
print('************************************************************')
print('************************************************************')
M1 Reconstructed:
M1 Original:
************************************************************ ************************************************************ M2 Reconstructed:
M2 Original:
************************************************************ ************************************************************ M3 Reconstructed:
M3 Original:
/Users/cagandeliktas/Desktop/Mannheim Uni Master/Machine Learning/homeworks/ML_Assignments/assignment_3/util.py:33: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. plt.figure() # this creates a new plot
************************************************************ ************************************************************ M4 Reconstructed:
M4 Original:
************************************************************ ************************************************************ M5 Reconstructed:
M5 Original:
************************************************************ ************************************************************ M6 Reconstructed:
M6 Original:
************************************************************ ************************************************************
1d¶
In [62]:
# Another method to compute the rank is matrix_rank.
# YOUR PART
U, s, Vt = svd(M6)
print(f"Rank(M6): {np.linalg.matrix_rank(M6)}")
print('************************************************')
print('Numpy Sigma (rounded): ', np.round(s, 5))
print('Numpy Sigma: ', s)
print('************************************************')
print("SVD reconstructed: ")
print(np.round(svdcomp(M6, range(2)), 5))
print('Original matrix: ')
print(M6)
Rank(M6): 2 ************************************************ Numpy Sigma (rounded): [4.82843 0.82843 0. 0. 0. ] Numpy Sigma: [4.82842712e+00 8.28427125e-01 9.95090019e-17 2.18529703e-17 5.31822283e-50] ************************************************ SVD reconstructed: [[ 1. 1. 1. 1. 1.] [ 1. 1. 1. 1. 1.] [ 1. 1. -0. 1. 1.] [ 1. 1. 1. 1. 1.] [ 1. 1. 1. 1. 1.]] Original matrix: [[1 1 1 1 1] [1 1 1 1 1] [1 1 0 1 1] [1 1 1 1 1] [1 1 1 1 1]]
In [63]:
s > 0
Out[63]:
array([ True, True, True, True, True])
numpy didn't return any nonzero singular values, but the last three singular values are really small: 9.95090019e-17, 2.18529703e-17, 5.31822283e-50. we should consider them as zeros. So, we rounded the singular values up to 5 decimals.¶
2 The SVD on Weather Data¶
In [64]:
# Load the data
climate = pd.read_csv("data/worldclim.csv")
coord = pd.read_csv("data/worldclim_coordinates.csv")
lon = coord["lon"]
lat = coord["lat"]
In [65]:
# Plot the coordinates
plot_xy(lon, lat)
2a¶
In [66]:
climate.describe().iloc[[3, 7],:] #data has to be normalized because the ranges are different.
Out[66]:
| min1 | min2 | min3 | min4 | min5 | min6 | min7 | min8 | min9 | min10 | ... | rain3 | rain4 | rain5 | rain6 | rain7 | rain8 | rain9 | rain10 | rain11 | rain12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| min | -23.2 | -23.6 | -22.5 | -19.6 | -11.4 | -5.5 | -2.1 | -2.6 | -6.7 | -12.8 | ... | 18.50 | 14.407 | 7.50 | 1.6667 | 0.00 | 0.00 | 8.25 | 19.517 | 20.815 | 20.00 |
| max | 11.9 | 11.3 | 11.8 | 13.3 | 16.4 | 20.9 | 23.1 | 23.8 | 22.2 | 19.0 | ... | 188.11 | 141.170 | 158.33 | 181.1700 | 173.75 | 186.67 | 278.32 | 310.710 | 268.890 | 302.04 |
2 rows × 48 columns
In [67]:
# YOUR PART
# Normalize the data to z-scores. Store the result in X.
def z_score_scaling(data):
data = np.array(data)
mean = np.mean(data, axis=0)
std_dev = np.std(data, axis=0)
z_scores = (data - mean) / std_dev
return z_scores
display(climate.head())
X = pd.DataFrame(z_score_scaling(climate), columns=climate.columns)
display(X.head())
display(X.describe().iloc[[3, 7],:])
| min1 | min2 | min3 | min4 | min5 | min6 | min7 | min8 | min9 | min10 | ... | rain3 | rain4 | rain5 | rain6 | rain7 | rain8 | rain9 | rain10 | rain11 | rain12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10.6 | 9.9 | 10.5 | 11.0 | 12.5 | 14.7 | 16.9 | 18.0 | 17.1 | 15.0 | ... | 103.00 | 74.000 | 66.000 | 53.000 | 41.000 | 57.000 | 92.000 | 118.0 | 126.00 | 126.0 |
| 1 | 8.3 | 7.6 | 8.1 | 8.6 | 10.2 | 12.4 | 14.7 | 15.9 | 14.8 | 12.7 | ... | 108.29 | 72.429 | 63.286 | 50.571 | 38.857 | 53.714 | 88.571 | 118.0 | 125.43 | 120.0 |
| 2 | 10.1 | 9.5 | 9.9 | 10.5 | 12.0 | 14.2 | 16.4 | 17.4 | 16.6 | 14.6 | ... | 119.00 | 75.000 | 60.400 | 48.400 | 38.400 | 50.400 | 84.400 | 120.4 | 127.00 | 117.8 |
| 3 | 10.2 | 9.7 | 9.9 | 10.7 | 12.2 | 14.3 | 16.4 | 17.4 | 16.6 | 14.7 | ... | 141.20 | 80.400 | 54.200 | 44.000 | 36.600 | 42.600 | 75.600 | 123.6 | 129.40 | 114.4 |
| 4 | 11.7 | 11.1 | 11.5 | 12.1 | 13.5 | 15.7 | 17.8 | 18.9 | 18.1 | 16.1 | ... | 119.50 | 74.000 | 58.500 | 47.500 | 38.000 | 48.500 | 83.500 | 121.0 | 125.00 | 116.0 |
5 rows × 48 columns
| min1 | min2 | min3 | min4 | min5 | min6 | min7 | min8 | min9 | min10 | ... | rain3 | rain4 | rain5 | rain6 | rain7 | rain8 | rain9 | rain10 | rain11 | rain12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.365957 | 2.200220 | 2.104881 | 1.861889 | 1.545324 | 1.280310 | 1.320737 | 1.604380 | 1.927607 | 2.212659 | ... | 1.721196 | 1.053074 | 0.339022 | -0.340384 | -0.661845 | -0.250235 | 0.818142 | 1.240162 | 1.432487 | 1.334708 |
| 1 | 2.011593 | 1.845426 | 1.694031 | 1.384333 | 1.014846 | 0.706090 | 0.769095 | 1.102594 | 1.416389 | 1.724273 | ... | 1.913735 | 0.975600 | 0.215637 | -0.429133 | -0.731620 | -0.357411 | 0.707674 | 1.240162 | 1.416073 | 1.179510 |
| 2 | 2.288921 | 2.138517 | 2.002169 | 1.762398 | 1.430003 | 1.155479 | 1.195364 | 1.461012 | 1.816472 | 2.127722 | ... | 2.303543 | 1.102389 | 0.084431 | -0.508457 | -0.746500 | -0.465501 | 0.573302 | 1.306088 | 1.461285 | 1.122603 |
| 3 | 2.304328 | 2.169369 | 2.002169 | 1.802194 | 1.476131 | 1.180445 | 1.195364 | 1.461012 | 1.816472 | 2.148956 | ... | 3.111550 | 1.368689 | -0.197438 | -0.669222 | -0.805107 | -0.719906 | 0.289803 | 1.393988 | 1.530399 | 1.034657 |
| 4 | 2.535435 | 2.385330 | 2.276069 | 2.080768 | 1.775967 | 1.529971 | 1.546408 | 1.819431 | 2.149875 | 2.446234 | ... | 2.321741 | 1.053074 | -0.001948 | -0.541340 | -0.759524 | -0.527471 | 0.544307 | 1.322569 | 1.403690 | 1.076044 |
5 rows × 48 columns
| min1 | min2 | min3 | min4 | min5 | min6 | min7 | min8 | min9 | min10 | ... | rain3 | rain4 | rain5 | rain6 | rain7 | rain8 | rain9 | rain10 | rain11 | rain12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| min | -2.841644 | -2.967432 | -3.544305 | -4.226950 | -3.967037 | -3.762841 | -3.443443 | -3.317905 | -3.362382 | -3.690441 | ... | -1.354324 | -1.885748 | -2.320548 | -2.215978 | -1.996789 | -2.10935 | -1.879931 | -1.465064 | -1.596571 | -1.407138 |
| max | 2.566249 | 2.416182 | 2.327425 | 2.319546 | 2.444831 | 2.828207 | 2.875364 | 2.990266 | 3.061175 | 3.062025 | ... | 4.818918 | 4.365554 | 4.536598 | 4.342636 | 3.660442 | 3.97909 | 6.820588 | 6.533706 | 5.547354 | 5.888242 |
2 rows × 48 columns
In [68]:
# Plot histograms of attributes
#nextplot()
#X.hist(ax=plt.gca())
2b¶
In [69]:
# Compute the SVD of the normalized climate data and store it in variables U,s,Vt. What
# is the rank of the data?
# YOUR PART
U, s, Vt = np.linalg.svd(X)
rank = np.sum(s > 1e-10)
print(f"Rank of the normalized climate data: {rank}")
print(f"Rank of the normalized climate data (numpy): {np.linalg.matrix_rank(X)}")
print("Singular values:")
print(s)
Rank of the normalized climate data: 48 Rank of the normalized climate data (numpy): 48 Singular values: [2.90222389e+02 1.50668824e+02 8.84936404e+01 5.91859882e+01 5.21202132e+01 2.74621244e+01 2.21341436e+01 1.52406513e+01 1.41321813e+01 1.20289877e+01 1.14767046e+01 1.09209834e+01 9.14704009e+00 8.39692373e+00 7.93211636e+00 7.06774614e+00 6.74240524e+00 6.51838587e+00 5.76805648e+00 5.39678641e+00 5.06878890e+00 4.21038123e+00 3.88507570e+00 3.37992885e+00 3.12011424e+00 2.88184606e+00 2.53346089e+00 2.48165895e+00 2.32967767e+00 2.07730350e+00 1.90548668e+00 1.86392296e+00 1.72330112e+00 1.60158454e+00 1.28336386e+00 1.12607554e+00 1.04958416e+00 9.84527428e-01 8.39920385e-01 6.56703416e-01 4.94031887e-01 4.13336481e-01 3.78242622e-01 3.47242119e-01 3.20887328e-01 3.06204289e-01 3.00736590e-01 2.55298845e-01]
2c¶
In [70]:
# For interpretation, it may also help to look at the other component matrices and
# perhaps use other plot functions (e.g., plot_matrix).
# YOUR PART
num_columns = 5
# Plot each of the first 5 columns of U
for i in range(num_columns):
print(f'Left singuler vector {i+1} ')
plot_xy(lon, lat, U[:, i])
Left singuler vector 1
Left singuler vector 2
Left singuler vector 3
Left singuler vector 4
Left singuler vector 5
2d¶
"When ( u[:,0] ) is on the x-axis, we can observe that locations with higher latitudes (further north) appear on the right side of the plot, colored in red, while lower latitudes (closer to the south) appear on the left side, colored in blue. This indicates that ( u[:,0] ) corresponds to the north-south direction, with positive values of ( u[:,0] ) representing regions with higher latitudes, closer to the North Pole."¶
In [71]:
num_columns = 5
lat_centered = lat - np.mean(lat)
lon_centered = lon - np.mean(lon)
fig, axes = plt.subplots(5, 2, figsize=(15, 20))
fig.suptitle("Plots of U Matrix Columns with Latitude Centered Coloring", fontsize=16)
count = 0
for i in range(num_columns - 1):
for j in range(i + 1, num_columns):
row, col = divmod(count, 2)
ax = axes[row, col]
scatter_lat = ax.scatter(U[:, i], U[:, j], c=lat_centered, cmap='coolwarm')
ax.set_xlabel(f'U[:, {i}]')
ax.set_ylabel(f'U[:, {j}]')
ax.set_title(f'U[:, {i}] vs U[:, {j}] (Lat-centered)', fontsize=10)
fig.colorbar(scatter_lat, ax=ax, orientation="vertical", label="Latitude (centered)")
count += 1
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
Lower values of u[:, 1] correspond to higher longitudes (further east, represented by yellow on the legend). As the values of u[:, 1] increase, the locations represent lower longitudes (further west, represented by green-blue on the legend).¶
In [72]:
fig_lon, axes_lon = plt.subplots(5, 2, figsize=(15, 20))
fig_lon.suptitle("Plots of U Matrix Columns with Longitude Centered Coloring", fontsize=16)
count = 0
for i in range(num_columns - 1):
for j in range(i + 1, num_columns):
row, col = divmod(count, 2)
ax = axes_lon[row, col]
scatter_lon = ax.scatter(U[:, i], U[:, j], c=lon_centered, cmap='viridis')
ax.set_xlabel(f'U[:, {i}]')
ax.set_ylabel(f'U[:, {j}]')
ax.set_title(f'U[:, {i}] vs U[:, {j}] (Lon-centered)', fontsize=10)
fig_lon.colorbar(scatter_lon, ax=ax, orientation="vertical", label="Longitude (centered)")
count += 1
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
2e¶
In [73]:
# 2e(i) Guttman-Kaiser
# YOUR PART
print('index of the last singular value which is greater than 1: ', sum(s >= 1)-1)
index of the last singular value which is greater than 1: 36
In [74]:
print(f'k={sum(s >= 1)}') #37 singular values, k=37
k=37
In [75]:
k_gutmann = s[:sum(s >= 1)]
k_gutmann
Out[75]:
array([290.22238931, 150.66882365, 88.49364039, 59.18598821,
52.12021318, 27.46212444, 22.13414365, 15.24065125,
14.13218128, 12.02898772, 11.47670461, 10.92098344,
9.14704009, 8.39692373, 7.93211636, 7.06774614,
6.74240524, 6.51838587, 5.76805648, 5.39678641,
5.0687889 , 4.21038123, 3.8850757 , 3.37992885,
3.12011424, 2.88184606, 2.53346089, 2.48165895,
2.32967767, 2.0773035 , 1.90548668, 1.86392296,
1.72330112, 1.60158454, 1.28336386, 1.12607554,
1.04958416])
In [76]:
# 2e(ii) 90% squared Frobenius norm
# YOUR PART
def frobenius_norm(singular_values):
return np.sqrt(np.sum(singular_values**2))
def rank_90_percent_frobenius(S, threshold=0.9):
total_norm = frobenius_norm(S)
squared_norms = np.cumsum(S**2)
rank = np.argmax(squared_norms >= threshold * total_norm) + 1
return rank
print(f'k={rank_90_percent_frobenius(s)}')
print('Only selects the first singular value')
ninety_percent_fro_norm = s[rank_90_percent_frobenius(s)-1]
print('first singular value: ', ninety_percent_fro_norm)
k=1 Only selects the first singular value first singular value: 290.2223893074994
In [77]:
# 2e(iii) Scree test
# YOUR PART
def scree_test(S):
plt.figure(figsize=(8, 5))
plt.plot(S, marker='o', linestyle='--')
plt.title("Scree Plot")
plt.xlabel("Index")
plt.ylabel("Singular Value")
plt.show()
scree_test(s)
In [86]:
# 2e(iv) entropy
# YOUR PART
def entropy_based_k(S):
squared_singular_values = S ** 2
total_frobenius_norm = np.sum(squared_singular_values)
relative_contributions = squared_singular_values / total_frobenius_norm
entropy = -np.sum(relative_contributions * np.log(relative_contributions)) / np.log(len(relative_contributions))
print(f"Entropy of singular values: {entropy:.4f}")
cumulative_sum = np.cumsum(relative_contributions)
rank_k = np.argmax(cumulative_sum >= entropy) + 1
return rank_k
print(f'k={entropy_based_k(s)}')
Entropy of singular values: 0.2752 k=1
In [87]:
# 2e(v) random flips
# Random sign matrix: np.random.choice([-1,1], X.shape)
# YOUR PART
from numpy.linalg import norm
np.random.seed(15)
def random_flip_signs(matrix):
flip_matrix = np.random.choice([1, -1], size=matrix.shape)
return matrix * flip_matrix
def residual_matrix(U, S, Vt, k):
S_rest = S[k:]
U_residual = U[:, k:]
Vt_residual = Vt[k:, :]
S_residual = np.zeros((U_residual.shape[1], Vt_residual.shape[0]))
np.fill_diagonal(S_residual, S_rest)
return U_residual @ S_residual @ Vt_residual
def random_flip_rank_selection(A, max_rank=None):
U, S, Vt = svd(A)
max_rank = max_rank or min(A.shape)
best_rank = 1
min_difference = float("inf")
for k in range(1, max_rank + 1):
X_minus_k = residual_matrix(U, S, Vt, k)
X_minus_k_flipped = random_flip_signs(X_minus_k)
spectral_norm_diff = abs(norm(X_minus_k, ord=2) - norm(X_minus_k_flipped, ord=2))
frobenius_norm_residual = norm(X_minus_k, ord='fro')
if frobenius_norm_residual != 0:
normalized_difference = spectral_norm_diff / frobenius_norm_residual
else:
normalized_difference = float("inf")
if normalized_difference < min_difference:
min_difference = normalized_difference
best_rank = k
return best_rank
In [88]:
print(f'k={random_flip_rank_selection(X)}')
k=7
2f¶
In [89]:
# Here is the empty plot that you need to fill (one line per choice of k: RSME between
# original X and the reconstruction from size-k SVD of noisy versions)
# YOUR PART
In [90]:
def rmse(A, A_hat):
return np.sqrt(np.sum((A - A_hat)**2) / (A.shape[0] * A.shape[1]))
def add_noise(X, epsilon):
return np.array(X) + np.random.randn(*X.shape) * epsilon
def rank_k_approximation(X, k):
U, S, Vt = svd(X)
S_k = np.diag(S[:k])
U_k = U[:, :k]
Vt_k = Vt[:k, :]
return U_k @ S_k @ Vt_k
In [91]:
epsilons = np.linspace(0, 2, 21)
ranks = [1, 2, 5, 10, 48]
rmse_values = {k: [] for k in ranks}
In [92]:
for epsilon in epsilons:
X_noise = add_noise(X, epsilon)
for k in ranks:
X_k = rank_k_approximation(X_noise, k)
rmse_value = rmse(np.array(X), X_k)
rmse_values[k].append(rmse_value)
In [93]:
plt.figure(figsize=(8, 6))
for k in ranks:
plt.plot(epsilons, rmse_values[k], label=f'k={k}')
plt.xlabel(r"Noise level ($\epsilon$)")
plt.ylabel("RMSE")
plt.title('RMSE vs. Epsilon for Rank-k Truncated SVD Approximations')
plt.legend()
plt.grid(True)
plt.show()
3 SVD and k-means¶
In [94]:
# Cluster the normalized climate data into 5 clusters using k-means and store
# the vector giving the cluster labels for each location.
X_clusters = KMeans(5).fit(X).labels_
In [95]:
X_with_cluster = pd.concat([X, pd.DataFrame({'cluster':X_clusters})], axis=1)
X_with_cluster.head()
Out[95]:
| min1 | min2 | min3 | min4 | min5 | min6 | min7 | min8 | min9 | min10 | ... | rain4 | rain5 | rain6 | rain7 | rain8 | rain9 | rain10 | rain11 | rain12 | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.365957 | 2.200220 | 2.104881 | 1.861889 | 1.545324 | 1.280310 | 1.320737 | 1.604380 | 1.927607 | 2.212659 | ... | 1.053074 | 0.339022 | -0.340384 | -0.661845 | -0.250235 | 0.818142 | 1.240162 | 1.432487 | 1.334708 | 2 |
| 1 | 2.011593 | 1.845426 | 1.694031 | 1.384333 | 1.014846 | 0.706090 | 0.769095 | 1.102594 | 1.416389 | 1.724273 | ... | 0.975600 | 0.215637 | -0.429133 | -0.731620 | -0.357411 | 0.707674 | 1.240162 | 1.416073 | 1.179510 | 2 |
| 2 | 2.288921 | 2.138517 | 2.002169 | 1.762398 | 1.430003 | 1.155479 | 1.195364 | 1.461012 | 1.816472 | 2.127722 | ... | 1.102389 | 0.084431 | -0.508457 | -0.746500 | -0.465501 | 0.573302 | 1.306088 | 1.461285 | 1.122603 | 2 |
| 3 | 2.304328 | 2.169369 | 2.002169 | 1.802194 | 1.476131 | 1.180445 | 1.195364 | 1.461012 | 1.816472 | 2.148956 | ... | 1.368689 | -0.197438 | -0.669222 | -0.805107 | -0.719906 | 0.289803 | 1.393988 | 1.530399 | 1.034657 | 2 |
| 4 | 2.535435 | 2.385330 | 2.276069 | 2.080768 | 1.775967 | 1.529971 | 1.546408 | 1.819431 | 2.149875 | 2.446234 | ... | 1.053074 | -0.001948 | -0.541340 | -0.759524 | -0.527471 | 0.544307 | 1.322569 | 1.403690 | 1.076044 | 2 |
5 rows × 49 columns
In [96]:
coord_cluster = pd.DataFrame({'lat': lat, 'lon':lon, 'cluster':X_clusters})
coord_cluster.head()
Out[96]:
| lat | lon | cluster | |
|---|---|---|---|
| 0 | 38.61 | -29.01 | 2 |
| 1 | 38.61 | -28.44 | 2 |
| 2 | 38.62 | -27.86 | 2 |
| 3 | 38.62 | -27.29 | 2 |
| 4 | 39.07 | -27.87 | 2 |
3a¶
In [97]:
# Plot the results to the map: use the cluster labels to give the color to each
# point.
plot_xy(lon, lat, X_clusters)
3b¶
In [98]:
X_with_cluster.cluster.unique()
Out[98]:
array([2, 0, 3, 4, 1], dtype=int32)
In [99]:
unique_clusters = np.unique(X_clusters)
colors = plt.cm.viridis(np.linspace(0, 1, len(unique_clusters)))
plt.figure(figsize=(8, 6))
for i, cluster_id in enumerate(unique_clusters):
plt.scatter(U[X_clusters == cluster_id, 0], U[X_clusters == cluster_id, 1],
color=colors[i], s=10, alpha=0.7, label=f'Cluster {int(cluster_id)}')
plt.xlabel("First Left Singular Vector")
plt.ylabel("Second Left Singular Vector")
plt.title("Cluster Visualization in SVD Space")
plt.legend(title='Cluster ID', loc='upper right')
plt.show()
3c¶
In [100]:
# Compute the PCA scores, store in Z (of shape N x k)
k = 2 # select the first two components (left singular vectors)
# YOUR PART HERE
Z = U[:, :k] @ np.diag(s[:k]) #pca scores for each example
In [101]:
for k in [1, 2, 3]:
Z = U[:, :k] @ np.diag(s[:k])
# cluster and visualize
Z_clusters = KMeans(5).fit(Z).labels_
# match clusters as well as possible (try without)
Z_clusters = match_categories(X_clusters, Z_clusters)
nextplot()
axs = plt.gcf().subplots(1, 2)
plot_xy(lon, lat, X_clusters, axis=axs[0])
axs[0].set_title("Original data")
plot_xy(lon, lat, Z_clusters, axis=axs[1])
axs[1].set_title(f"PCA $(k={k}$)")